Open a terminal and run the following commands:
conda activate <ENV_NAME>
conda install -c conda-forge pandas-profiling -y
<ENV_NAME> refers to the name of the environment you'll be working with for this project! indicates a terminal command-c flag indicates the channel we're using to get the package-y flag indicates that we're confirming the installation of pandas-profiling and its dependencies a prioriWe're installing pandas-profiling through this method because Anaconda's default channel contains an outdated version of this package, whereas the channel conda-forge has an updated version.
# Remember: library imports are ALWAYS at the top of the script, no exceptions!
import sqlite3
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
# notice we are adding two new imports for visualizations; This was not here last class
from itertools import product
# from pandas_profiling import ProfileReport
%matplotlib inline
# for better resolution plots
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'svg' to 'retina'
# Seeting seaborn style
sns.set()
The data we will be using through the pratical classes comes from a small relational database whose schema can be seen below:

# path to database
my_path = os.path.join("..", "data", "datamining.db")
# connect to the database
conn = sqlite3.connect(my_path)
# the query
query = """
select
age,
income,
frq,
rcn,
mnt,
clothes,
kitchen,
small_appliances,
toys,
house_keeping,
dependents,
per_net_purchase,
g.gender,
e.education,
m.status,
r.description
from customers as c
join genders as g on g.id = c.gender_id
join education_levels as e on e.id = c.education_id
join marital_status as m on m.id = c.marital_status_id
join recommendations as r on r.id = c.recommendation_id
order by c.id;
"""
df = pd.read_sql_query(query, conn)
Pandas user guide: https://pandas.pydata.org/pandas-docs/stable/user_guide/index.html
Pandas 10 min tutorial: https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html
# dataset head
df.head(10)
| age | income | frq | rcn | mnt | clothes | kitchen | small_appliances | toys | house_keeping | dependents | per_net_purchase | gender | education | status | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1946 | 90782 | 33 | 66 | 1402 | 37 | 5 | 44 | 10 | 3 | 0 | 19 | M | Graduation | Together | Take my money!! |
| 1 | 1936 | 113023 | 32 | 6 | 1537 | 55 | 1 | 38 | 4 | 2 | 0 | 9 | F | PhD | Divorced | Take my money!! |
| 2 | 1990 | 28344 | 11 | 69 | 44 | 32 | 19 | 24 | 1 | 24 | 1 | 59 | M | Graduation | Married | Kind of OK |
| 3 | 1955 | 93571 | 26 | 10 | 888 | 60 | 10 | 19 | 6 | 5 | 1 | 35 | F | Master | OK nice! | |
| 4 | 1955 | 91852 | 31 | 26 | 1138 | 59 | 5 | 28 | 4 | 4 | 1 | 34 | F | Graduation | Together | Take my money!! |
| 5 | 1982 | 22386 | 14 | 65 | 56 | 47 | 2 | 48 | 2 | 1 | 1 | 67 | M | PhD | Single | OK nice! |
| 6 | 1969 | 69485 | 18 | 73 | 345 | 71 | 7 | 13 | 1 | 8 | 1 | 46 | M | Graduation | Together | OK nice! |
| 7 | 1960 | 68602 | 5 | 44 | 41 | 84 | 1 | 12 | 2 | 0 | 1 | 37 | M | Graduation | Together | Horrible |
| 8 | 1940 | 109499 | 30 | 75 | 1401 | 38 | 9 | 35 | 9 | 9 | 0 | 17 | M | Graduation | Divorced | OK nice! |
| 9 | 1994 | 23846 | 8 | 153 | 19 | 18 | 55 | 17 | 10 | 1 | 1 | 39 | F | 1st Cycle | Together | Meh... |
# dataset data types
df.dtypes
age int64 income object frq int64 rcn int64 mnt int64 clothes int64 kitchen int64 small_appliances int64 toys int64 house_keeping int64 dependents object per_net_purchase int64 gender object education object status object description object dtype: object
# count of missing values
df.isna().sum()
age 0 income 0 frq 0 rcn 0 mnt 0 clothes 0 kitchen 0 small_appliances 0 toys 0 house_keeping 0 dependents 0 per_net_purchase 0 gender 0 education 0 status 0 description 0 dtype: int64
# duplicated observations
df.duplicated().sum()
0
# descriptive statistics
df.describe(include="all").T # try with all and without all
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 8998 | NaN | NaN | NaN | 1966.06 | 17.2966 | 1936 | 1951 | 1966 | 1981 | 1996 |
| income | 8998 | 8525 | 46 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | |
| frq | 8998 | NaN | NaN | NaN | 19.8481 | 10.9034 | 3 | 10 | 17 | 28 | 59 |
| rcn | 8998 | NaN | NaN | NaN | 62.4698 | 69.7618 | 0 | 26 | 53 | 79 | 549 |
| mnt | 8998 | NaN | NaN | NaN | 622.163 | 646.768 | 6 | 63 | 383 | 1076 | 3052 |
| clothes | 8998 | NaN | NaN | NaN | 50.4467 | 23.4222 | 1 | 33 | 51 | 69 | 99 |
| kitchen | 8998 | NaN | NaN | NaN | 7.03968 | 7.84814 | 0 | 2 | 4 | 10 | 75 |
| small_appliances | 8998 | NaN | NaN | NaN | 28.5241 | 12.5864 | 1 | 19 | 28 | 37 | 74 |
| toys | 8998 | NaN | NaN | NaN | 7.0369 | 7.92442 | 0 | 2 | 4 | 10 | 62 |
| house_keeping | 8998 | NaN | NaN | NaN | 6.92998 | 7.88266 | 0 | 2 | 4 | 9 | 77 |
| dependents | 8998 | 3 | 1 | 6164 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| per_net_purchase | 8998 | NaN | NaN | NaN | 42.429 | 18.4957 | 4 | 28 | 45 | 57 | 88 |
| gender | 8998 | 2 | M | 5784 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| education | 8998 | 7 | Graduation | 4429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| status | 8998 | 7 | Married | 3273 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| description | 8998 | 5 | OK nice! | 3434 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
# Use this cell to further explore the dataset
# CODE HERE
# replace "" by nans
df.replace("", np.nan, inplace=True)
# count of missing values
df.isna().sum()
age 0 income 46 frq 0 rcn 0 mnt 0 clothes 0 kitchen 0 small_appliances 0 toys 0 house_keeping 0 dependents 282 per_net_purchase 0 gender 0 education 47 status 177 description 0 dtype: int64
# check dataset data types again
df.dtypes
age int64 income float64 frq int64 rcn int64 mnt int64 clothes int64 kitchen int64 small_appliances int64 toys int64 house_keeping int64 dependents float64 per_net_purchase int64 gender object education object status object description object dtype: object
# fix wrong dtypes
# df.dependents = df.dependents.astype(bool)
# in the future, this must only be done after dealing with missing values
# (changing a a variable type from float to boolean converts nans to True!)
# check descriptive statistics again
df.describe(include="all").T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| age | 8998 | NaN | NaN | NaN | 1966.06 | 17.2966 | 1936 | 1951 | 1966 | 1981 | 1996 |
| income | 8952 | NaN | NaN | NaN | 69963.6 | 27591.6 | 10000 | 47741 | 70030.5 | 92218 | 140628 |
| frq | 8998 | NaN | NaN | NaN | 19.8481 | 10.9034 | 3 | 10 | 17 | 28 | 59 |
| rcn | 8998 | NaN | NaN | NaN | 62.4698 | 69.7618 | 0 | 26 | 53 | 79 | 549 |
| mnt | 8998 | NaN | NaN | NaN | 622.163 | 646.768 | 6 | 63 | 383 | 1076 | 3052 |
| clothes | 8998 | NaN | NaN | NaN | 50.4467 | 23.4222 | 1 | 33 | 51 | 69 | 99 |
| kitchen | 8998 | NaN | NaN | NaN | 7.03968 | 7.84814 | 0 | 2 | 4 | 10 | 75 |
| small_appliances | 8998 | NaN | NaN | NaN | 28.5241 | 12.5864 | 1 | 19 | 28 | 37 | 74 |
| toys | 8998 | NaN | NaN | NaN | 7.0369 | 7.92442 | 0 | 2 | 4 | 10 | 62 |
| house_keeping | 8998 | NaN | NaN | NaN | 6.92998 | 7.88266 | 0 | 2 | 4 | 9 | 77 |
| dependents | 8716 | NaN | NaN | NaN | 0.707205 | 0.455071 | 0 | 0 | 1 | 1 | 1 |
| per_net_purchase | 8998 | NaN | NaN | NaN | 42.429 | 18.4957 | 4 | 28 | 45 | 57 | 88 |
| gender | 8998 | 2 | M | 5784 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| education | 8951 | 6 | Graduation | 4429 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| status | 8821 | 6 | Married | 3273 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| description | 8998 | 5 | OK nice! | 3434 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Matplotlib tutorials: https://matplotlib.org/3.3.1/tutorials/index.html
Matplotlib gallery: https://matplotlib.org/3.3.1/tutorials/introductory/sample_plots.html#sphx-glr-tutorials-introductory-sample-plots-py
Seaborn tutorials: https://seaborn.pydata.org/tutorial.html
Seaborn gallery: https://seaborn.pydata.org/examples/index.html
More examples for visualizing distributions:
#Define metric and non-metric features. Why?
non_metric_features = ["education", "status", "gender", "dependents", "description"]
metric_features = df.columns.drop(non_metric_features).to_list()
# Single Metric Variable Histogram
plt.hist(df["age"], bins=10) # mess around with the bins
plt.title("age", y=-0.2)
plt.show()
# Single Metric Variable Box Plot
sns.boxplot(y=df["age"])
plt.show()
What information can we extract from the plots above?
# All Numeric Variables' Histograms in one figure
sns.set()
# Prepare figure. Create individual axes where each histogram will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
for ax, feat in zip(axes.flatten(), metric_features): # Notice the zip() function and flatten() method
ax.hist(df[feat])
ax.set_title(feat, y=-0.13)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Histograms"
plt.suptitle(title)
plt.savefig(os.path.join('..', 'figures', 'exp_analysis', 'numeric_variables_histograms.png'), dpi=200)
plt.show()
# All Numeric Variables' Box Plots in one figure
sns.set()
# Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots(2, ceil(len(metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each box plot (hint: use the ax argument):
for ax, feat in zip(axes.flatten(), metric_features): # Notice the zip() function and flatten() method
sns.boxplot(x=df[feat], ax=ax)
# Layout
# Add a centered title to the figure:
title = "Numeric Variables' Box Plots"
plt.suptitle(title)
plt.savefig(os.path.join('..', 'figures', 'exp_analysis', 'numeric_variables_boxplots.png'), dpi=200)
plt.show()
# Single Metric Variable Scatter plot
plt.scatter(df["age"], df["income"], edgecolors="white")
plt.xlabel("age")
plt.ylabel("income")
plt.show()
# Pairwise Relationship of Numerical Variables
sns.set()
# Setting pairplot
sns.pairplot(df[metric_features], diag_kind="hist")
# Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)
plt.savefig(os.path.join('..', 'figures', 'exp_analysis', 'pairwise_relationship_of_numerical_variables.png'), dpi=200)
plt.show()
# making a joint plot with default formatting
sns.jointplot(data=df, x="house_keeping", y="frq")
plt.show()
# Making the same visualization with customized formatting
sns.set(style="ticks")
sns.jointplot(data=df, x="house_keeping", y="frq", kind="hex", color="red")
plt.show()
# Single Non-Metric variable bar plot
sns.set() # this resets our formatting defaults
sns.countplot(x=df["education"])
plt.show()
# formatting the color of a simple bar chart
sns.countplot(x=df["education"], color='#007acc')
# try replacing the color to 'red' or 'blue', instead of using an RGB code.
# alterntively, you can get the RGB code for a given color here:
# https://www.w3schools.com/colors/colors_picker.asp
# keep in mind any other color picker will do just as well
plt.show()
What information can we extract from the plot above?
Using the same logic from the multiple box plot figure above, build a multiple bar plot figure for each non-metric variable:
# All Non-Metric Variables' Absolute Frequencies
sns.set()
# Prepare figure. Create individual axes where each bar plot will be placed
fig, axes = plt.subplots(2, ceil(len(non_metric_features) / 2), figsize=(20, 11))
# Plot data
# Iterate across axes objects and associate each bar plot (hint: use the ax argument):
for ax, feat in zip(axes.flatten(), non_metric_features): # Notice the zip() function and flatten() method
sns.countplot(df[feat], ax=ax, color='#007acc')
title = "Categorical/Low Cardinality Variables' Absolute Frequencies"
plt.suptitle(title)
plt.savefig(os.path.join('..', 'figures', 'exp_analysis', 'categorical_variables_frequecies.png'), dpi=200)
plt.show()
/Users/philippmetzger/anaconda3/envs/data_mining_lab/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /Users/philippmetzger/anaconda3/envs/data_mining_lab/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /Users/philippmetzger/anaconda3/envs/data_mining_lab/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /Users/philippmetzger/anaconda3/envs/data_mining_lab/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn( /Users/philippmetzger/anaconda3/envs/data_mining_lab/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
# Let's break this down, step by step (pandas plot - matplotlib behind)
sns.set()
df_counts = df\
.groupby(['description', 'dependents'])\
.size()\
.unstack()\
.plot.bar(stacked=True)
# Pairwise Relationship of Numerical Variables
sns.set()
# Setting pairplot
sns.pairplot(df[metric_features + ['gender']], diag_kind="hist", hue='gender')
# Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Numerical Variables", fontsize=20)
plt.show()